We're using standard Python data tools and the imbalanced-learn contrib toolkit. The notebook includes analysis and techniques presented across multiple sources and, where possible, each is labeled with a link to the underlying paper, code, or technique.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.utils.multiclass import unique_labels
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from imblearn.pipeline import make_pipeline
from imblearn.base import BaseSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.under_sampling import AllKNN
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
def create_dataset(n_samples=1000, weights=(0.02, 0.98), n_classes=2, class_sep=0.8, n_clusters=1):
    """Build a synthetic 2-D classification dataset with a configurable class imbalance.

    Thin wrapper over sklearn's make_classification: two informative features,
    no redundant/repeated ones, and a fixed random_state so every run of the
    notebook sees the same data.
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=list(weights),
        class_sep=class_sep,
        random_state=0,
    )
    return features, labels
def plot_resampling(X, y, sampling, ax):
    """Resample (X, y) with the given sampler, scatter the result on ax,
    and return a Counter of the resampled class labels."""
    resampled_X, resampled_y = sampling.fit_resample(X, y)
    ax.scatter(resampled_X[:, 0], resampled_X[:, 1], c=resampled_y,
               alpha=0.8, edgecolor='k')
    # Cosmetic axis cleanup: hide the top/right spines, keep ticks on the
    # bottom/left, and push the remaining spines slightly outward.
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))
    return Counter(resampled_y)
def plot_decision_function(X, y, clf, ax):
    """Draw clf's predicted-class regions as filled contours on ax, with the
    training points scattered on top."""
    step = 0.02
    # Extend the evaluation grid one unit beyond the data on every side.
    x_lo, x_hi = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_lo, y_hi = X[:, 1].min() - 1, X[:, 1].max() + 1
    grid_x, grid_y = np.meshgrid(np.arange(x_lo, x_hi, step),
                                 np.arange(y_lo, y_hi, step))
    # Classify every grid point, then fold the flat predictions back into
    # the grid shape expected by contourf.
    preds = clf.predict(np.c_[grid_x.ravel(), grid_y.ravel()])
    preds = preds.reshape(grid_x.shape)
    ax.contourf(grid_x, grid_y, preds, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')
Confusion-matrix reporting adapted from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def confusion(technique, df, yHat, y):
    """Print and record confusion-matrix precision/recall for one technique.

    Parameters
    ----------
    technique : str
        Row label under which the four scores are stored in ``df``.
    df : pandas.DataFrame
        Summary frame with columns [Maj Precision, Maj Recall, Min Precision,
        Min Recall]; mutated in place via ``df.loc[technique] = ...``.
    yHat, y : array-like
        Predicted and true binary labels (0 = majority, 1 = minority).

    Returns
    -------
    tuple
        (confusion matrix, imbalanced classification report,
         (majority precision, majority recall),
         (minority precision, minority recall)).
    """
    def _ratio(num, den):
        # Guard against zero denominators (e.g. a classifier that never
        # predicts the minority class): numpy would otherwise emit a
        # divide warning and silently store inf/nan.
        return num / den if den else float('nan')

    confusionMatrix = confusion_matrix(y, yHat)
    classReport = classification_report_imbalanced(y, yHat)
    # sklearn convention: rows are true labels, columns are predictions.
    tn, fp = confusionMatrix[0][0], confusionMatrix[0][1]
    fn, tp = confusionMatrix[1][0], confusionMatrix[1][1]
    majPrecision = _ratio(tn, tn + fn)
    majRecall = _ratio(tn, tn + fp)
    minPrecision = _ratio(tp, tp + fp)
    minRecall = _ratio(tp, tp + fn)
    df.loc[technique] = [majPrecision, majRecall, minPrecision, minRecall]
    print(confusionMatrix)
    print(classReport)
    return confusionMatrix, classReport, (majPrecision, majRecall), (minPrecision, minRecall)
# Baseline sweep: fit a plain LinearSVC (no resampling) on four datasets of
# increasing imbalance and plot each decision boundary side by side.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
ax_arr = (ax1, ax2, ax3, ax4)
# Class-weight pairs range from perfectly balanced to 95/5 imbalanced.
weights_arr = ((0.5, 0.5), (0.75, 0.25), (0.85, 0.15), (0.95, 0.05))
for ax, weights in zip(ax_arr, weights_arr):
    X, y = create_dataset(n_samples=1000, weights=weights, class_sep=0.4)
    pipe = make_pipeline(LinearSVC())
    pipe.fit(X, y)
    plot_decision_function(X, y, pipe, ax)
    ax.set_title('Linear SVC with y={}'.format(Counter(y)))
fig.tight_layout()
# Keep a summary of how well different techniques work
resultsDF = pd.DataFrame(columns=['Maj Precision', 'Maj Recall', 'Min Precision', 'Min Recall'])
# Baseline: LinearSVC on the 95/5 imbalanced dataset with no resampling.
# This X, y pair is reused by every resampling cell below.
fig, (ax1) = plt.subplots(1, 1, figsize=(15, 7))
X, y = create_dataset(n_samples=1000, weights=(0.95, 0.05), class_sep=0.4)
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# NOTE(review): metrics are computed on the training data itself — no
# held-out set; scores are optimistic.
con, cr, majority, minority = confusion('Baseline', resultsDF, pipe.predict(X), y)
resultsDF
Oversampling: achieve balance by increasing the number of minority class samples, which reduces the imbalance ratio.
# RandomOverSampler: duplicate minority samples at random until classes balance.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling, for comparison.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# Middle panel: decision function with oversampling applied inside the pipeline.
pipe = make_pipeline(RandomOverSampler(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for RandomOverSampler')
# Right panel: the resampled point cloud itself.
plot_resampling(X, y, RandomOverSampler(random_state=0), ax3)
ax3.set_title('Resampling using RandomOverSampler')
fig.tight_layout()
con, cr, majority, minority = confusion('Random Over Sampling', resultsDF, pipe.predict(X), y)
resultsDF
# SMOTE: synthesize new minority samples by interpolating between nearest neighbors.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# random_state=0 keeps the synthetic samples reproducible and consistent
# with the other resampling cells in this notebook.
pipe = make_pipeline(SMOTE(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for SMOTE')
plot_resampling(X, y, SMOTE(random_state=0), ax3)
ax3.set_title('Resampling using SMOTE')
fig.tight_layout()
con, cr, prec, rec = confusion('SMOTE', resultsDF, pipe.predict(X), y)
resultsDF
# Borderline SMOTE 1: oversample only minority points near the class boundary.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# Middle panel: decision function after borderline-1 oversampling.
pipe = make_pipeline(BorderlineSMOTE(random_state=0, kind='borderline-1'), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for Borderline SMOTE 1')
# Right panel: the resampled point cloud itself.
plot_resampling(X, y, BorderlineSMOTE(random_state=0, kind='borderline-1'), ax3)
ax3.set_title('Resampling using Borderline SMOTE 1')
fig.tight_layout()
con, cr, prec, rec = confusion('Borderline SMOTE 1', resultsDF, pipe.predict(X), y)
resultsDF
# Borderline SMOTE 2: as borderline-1, using the 'borderline-2' variant.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(BorderlineSMOTE(random_state=0, kind='borderline-2'), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for Borderline SMOTE 2')
plot_resampling(X, y, BorderlineSMOTE(random_state=0, kind='borderline-2'), ax3)
ax3.set_title('Resampling using Borderline SMOTE 2')
fig.tight_layout()
con, cr, prec, rec = confusion('Borderline SMOTE 2', resultsDF, pipe.predict(X), y)
resultsDF
SVM-SMOTE focuses on generating new minority class instances near borderlines with SVM so as to help establish boundary between classes. https://medium.com/vclab/tackling-class-imbalance-with-svm-smote-efa41ec3de5f
# SVM-SMOTE: generate minority samples near the SVM-estimated boundary.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(SVMSMOTE(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for SVMSMOTE')
# FIX: use random_state=0 here too, so the resampling shown on ax3 matches
# the resampling the fitted pipeline on ax2 actually used.
plot_resampling(X, y, SVMSMOTE(random_state=0), ax3)
ax3.set_title('Resampling using SVMSMOTE')
fig.tight_layout()
con, cr, prec, rec = confusion('SVMSMOTE', resultsDF, pipe.predict(X), y)
resultsDF
"SMOTE and ADASYN generate new samples in by interpolation. However, the samples used to interpolate/generate new synthetic samples differ. In fact, ADASYN focuses on generating samples next to the original samples which are wrongly classified using a k-Nearest Neighbors classifier while the basic implementation of SMOTE will not make any distinction between easy and hard samples to be classified using the nearest neighbors rule. Therefore, the decision function found during training will be different among the algorithms." https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#cbhk2002
# ADASYN: oversample adaptively, focusing on minority points that are hard
# to classify (see quote above from the imbalanced-learn docs).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# random_state=0 for reproducibility, consistent with the other cells.
pipe = make_pipeline(ADASYN(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for ADASYN')
plot_resampling(X, y, ADASYN(random_state=0), ax3)
ax3.set_title('Resampling using ADASYN')
fig.tight_layout()
con, cr, prec, rec = confusion('ADASYN', resultsDF, pipe.predict(X), y)
resultsDF
Undersampling: achieve balance by reducing the number of majority class datapoints, which reduces the imbalance ratio.
# RandomUnderSampler: discard majority samples at random until classes balance.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(RandomUnderSampler(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for RandomUnderSampler')
plot_resampling(X, y, RandomUnderSampler(random_state=0), ax3)
ax3.set_title('Resampling using RandomUnderSampler')
fig.tight_layout()
con, cr, prec, rec = confusion('Random Under Sampling', resultsDF, pipe.predict(X), y)
resultsDF
# TomekLinks: remove majority samples that form Tomek links (deterministic,
# so no random_state is needed).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(TomekLinks(), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for TomekLinks')
plot_resampling(X, y, TomekLinks(), ax3)
ax3.set_title('Resampling using TomekLinks')
fig.tight_layout()
con, cr, prec, rec = confusion('TomekLinks', resultsDF, pipe.predict(X), y)
resultsDF
Near Miss algorithms implement some heuristic rules in order to select samples.
# NearMiss version 1 under-sampling of the majority class.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(NearMiss(version=1), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for NearMiss-1')
plot_resampling(X, y, NearMiss(version=1), ax3)
ax3.set_title('Resampling using NearMiss-1')
fig.tight_layout()
con, cr, prec, rec = confusion('NearMiss-1', resultsDF, pipe.predict(X), y)
resultsDF
# NearMiss version 2 under-sampling of the majority class.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(NearMiss(version=2), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for NearMiss-2')
plot_resampling(X, y, NearMiss(version=2), ax3)
ax3.set_title('Resampling using NearMiss-2')
fig.tight_layout()
con, cr, prec, rec = confusion('NearMiss-2', resultsDF, pipe.predict(X), y)
resultsDF
# NearMiss version 3 under-sampling of the majority class.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(NearMiss(version=3), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for NearMiss-3')
plot_resampling(X, y, NearMiss(version=3), ax3)
ax3.set_title('Resampling using NearMiss-3')
fig.tight_layout()
con, cr, prec, rec = confusion('NearMiss-3', resultsDF, pipe.predict(X), y)
resultsDF
Background on nearest-neighbour editing techniques: http://cgm.cs.mcgill.ca/~godfried/teaching/projects.pr.98/sergei/project.html
# EditedNearestNeighbours: drop majority samples that disagree with their
# nearest neighbours (deterministic — no random_state).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(EditedNearestNeighbours(), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for Edited Nearest Neighbors')
plot_resampling(X, y, EditedNearestNeighbours(), ax3)
ax3.set_title('Resampling using EditedNearestNeighbors')
fig.tight_layout()
con, cr, prec, rec = confusion('Edited Nearest Neighbors', resultsDF, pipe.predict(X), y)
resultsDF
# RepeatedEditedNearestNeighbours: apply the ENN edit repeatedly until
# convergence (or its max_iter default).
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(RepeatedEditedNearestNeighbours(), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for Repeated Edited Nearest Neighbors')
plot_resampling(X, y, RepeatedEditedNearestNeighbours(), ax3)
ax3.set_title('Resampling using RepeatedEditedNearestNeighbors')
fig.tight_layout()
con, cr, prec, rec = confusion('Repeated Edited Nearest Neighbors', resultsDF, pipe.predict(X), y)
resultsDF
# AllKNN: ENN-style editing with an increasing neighbourhood size;
# allow_minority=True lets the edit also remove minority samples.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(AllKNN(allow_minority=True), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for ALL KNN')
plot_resampling(X, y, AllKNN(allow_minority=True), ax3)
ax3.set_title('Resampling using AllKNN')
fig.tight_layout()
con, cr, prec, rec = confusion('AllKNN', resultsDF, pipe.predict(X), y)
resultsDF
Some techniques such as XGBoost allow you to scale the weights of the minority class, achieving balance in the weight of all samples rather than in the count of samples.
# Combined over- and under-sampling: SMOTE followed by ENN cleaning.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# random_state=0 for reproducibility of the SMOTE step.
pipe = make_pipeline(SMOTEENN(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
# FIX: titles previously read 'SMOTE+EEN' — typo for ENN (Edited Nearest
# Neighbours), inconsistent with the 'SMOTE+ENN' results row below.
ax2.set_title('Decision function for SMOTE+ENN')
plot_resampling(X, y, SMOTEENN(random_state=0), ax3)
ax3.set_title('Resampling using SMOTE+ENN')
fig.tight_layout()
con, cr, prec, rec = confusion('SMOTE+ENN', resultsDF, pipe.predict(X), y)
resultsDF
# Combined over- and under-sampling: SMOTE followed by Tomek-link removal.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
# random_state=0 for reproducibility of the SMOTE step.
pipe = make_pipeline(SMOTETomek(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for SMOTE+TomekLinks')
plot_resampling(X, y, SMOTETomek(random_state=0), ax3)
ax3.set_title('Resampling using SMOTE+TomekLinks')
fig.tight_layout()
con, cr, prec, rec = confusion('SMOTE+TomekLinks', resultsDF, pipe.predict(X), y)
resultsDF
# Rank the techniques: top three by majority precision, then by minority recall.
resultsDF.sort_values('Maj Precision', ascending=False).head(3)
resultsDF.sort_values('Min Recall', ascending=False).head(3)
# Re-fit SMOTE (seeded) so the `pipe` global holds the model used for the
# probability/curve analysis below.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 7))
# Left panel: baseline decision function with no resampling.
pipe = make_pipeline(LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax1)
ax1.set_title('Linear SVC with y={}'.format(Counter(y)))
pipe = make_pipeline(SMOTE(random_state=0), LinearSVC())
pipe.fit(X, y)
plot_decision_function(X, y, pipe, ax2)
ax2.set_title('Decision function for SMOTE')
plot_resampling(X, y, SMOTE(random_state=0), ax3)
ax3.set_title('Resampling using SMOTE')
fig.tight_layout()
"ROC curves are appropriate when the observations are balanced between each class, whereas precision-recall curves are appropriate for imbalanced datasets. In both cases the area under the curve (AUC) can be used as a summary of the model performance." http://www.davidsbatista.net/blog/2018/08/19/NLP_Metrics/
Practical Guidelines
# Generate probabilities for the ROC / precision-recall curves below.
# FIX: SMOTE(kind='borderline-1') raises TypeError on imblearn >= 0.4 — the
# `kind` parameter was removed from SMOTE; the borderline variants live in
# the dedicated BorderlineSMOTE class (already imported above).
pipe = make_pipeline(BorderlineSMOTE(random_state=0, kind='borderline-1'),
                     SVC(probability=True, gamma='auto'))
pipe.fit(X, y)
# Keep only the probability of the positive (minority) class.
probs = pipe.predict_proba(X)
probs = probs[:, 1]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
# Imbalanced ROC Curve
# ROC plots TPR vs FPR across all thresholds; the dashed diagonal marks
# random-chance performance.
fpr, tpr, thresholds = roc_curve(y, probs)
ax1.plot([0, 1], [0, 1], linestyle='--')
ax1.plot(fpr, tpr, marker='.')
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate')
ax1.set_title('ROC Curve - AUC: %.3f' % roc_auc_score(y, probs))
# Precision Recall Curve
# PR curves are the more informative view under class imbalance (see quote
# above); the dashed horizontal line at 0.5 is a visual reference only.
precision, recall, thresholds = precision_recall_curve(y, probs)
ax2.plot([0, 1], [0.5, 0.5], linestyle='--')
ax2.plot(recall, precision, marker='.')
ax2.set_xlabel('Recall')
ax2.set_ylabel('Precision')
ax2.set_title('Precision Recall Curve - AUC: %.3f' % auc(recall, precision))